Testing primers against PR2
Testing primers against PR2
1 Goal
- Testing primers against the PR2 database (latest version 4.11.1)
2 Notes about script
- Chunks
compute_matchesandstore_matcheshave only to be run in 2 cases- new version of pr2
- new set of primers
- In the other case, they can be set eval=FALSE
2.1 To do
- Check why there are some negative amplicon (probably some matches ahead - Put a condition that amplicon should be >)
3 Initialize
Load the variables common to the different scripts and the necessary libraries
# Libraries for bioinfo ----------------------------------------------------
library("Biostrings")
# Libraries tidyr ---------------------------------------------------------
library("ggplot2")
library("dplyr")
library("readxl")
library("tibble")
library("readr")
library("purrr")
library("forcats")
library("lubridate")
library("stringr")
# Libraries dvutils and pr2database -------------------------------------------------------
if(any(grepl("package:dvutils", search()))) detach("package:dvutils", unload=TRUE)
library("dvutils")
if(any(grepl("package:pr2database", search()))) detach("package:pr2database", unload=TRUE)
library("pr2database")
# Options for knitting the report -------------
library(knitr)
library(rmdformats)
library(kableExtra)
knitr::opts_chunk$set(fig.width=8,
fig.height=6,
eval=TRUE,
cache=TRUE,
echo=TRUE,
prompt=FALSE,
tidy=TRUE,
comment=NA,
message=FALSE,
warning=FALSE)
opts_knit$set(width=90)
options(max.print="500")
options(knitr.kable.NA = '')4 Constants
5 Read primer file
5.1 Compute position on yeast - DO NOT RERUN
# Read from local database
pr2_db <- db_info("pr2_local")
pr2_db_con <- db_connect(pr2_db)
primers <- tbl(pr2_db_con, "pr2_primers") %>% collect()
disconnect <- db_disconnect(pr2_db_con)
ref_seq <- pr2 %>% filter(pr2_accession == "FU970071.1.1799_U")
# Matches the position of the primers on the yeast sequence - use map2_dfr to get an data
# frame on output and not a list - use ~ when defining the function
primers_pos <- map2_dfr(primers$sequence, primers$direction, ~get_primer_position(.x, ref_seq$sequence,
orientation = .y, mismatches = 3))
primers <- bind_cols(primers, primers_pos)
write_tsv(primers, path = "output/primers_matches.tsv", na = "")5.2 Build the primer set table 1
# Read from local database
pr2_db <- db_info("pr2_local")
pr2_db_con <- db_connect(pr2_db)
primers <- tbl(pr2_db_con, "pr2_primers") %>% collect()
primer_sets_all <- tbl(pr2_db_con, "pr2_primer_sets") %>% collect()
disconnect <- db_disconnect(pr2_db_con)
if (gene_selected == "18S_rRNA") {
gene_regions = c("V4", "V9")
# Just keep the selected primers (V4, V9 etc..) - Remove unpublished primers
primer_sets <- primer_sets_all %>% filter(gene == gene_selected) %>% filter(gene_region %in%
gene_regions) %>% filter(!(primer_set_id %in% c(12, 22, 26, 63:66, 77, 83)))
} else {
primer_sets <- primer_sets %>% filter(specificity == "plastid" | (gene == "18S_rRNA" &
specificity == "universal"))
}
primer_sets <- primer_sets %>% left_join(select(primers, primer_id, fwd_name = name, fwd_seq = sequence,
fwd_start_yeast = start_yeast, fwd_end_yeast = end_yeast), by = c(fwd_id = "primer_id")) %>%
left_join(select(primers, primer_id, rev_name = name, rev_seq = sequence, rev_start_yeast = start_yeast,
rev_end_yeast = end_yeast), by = c(rev_id = "primer_id")) %>% mutate(length_yeast = rev_end_yeast -
fwd_start_yeast + 1) %>% select(gene_region, specificity, primer_set_id, primer_set_name,
contains("fwd"), contains("rev"), length_yeast, used_in:remark) %>% select(-fwd_id, -rev_id) %>%
arrange(gene_region, fwd_start_yeast)
# write_tsv(primer_sets, path = 'output/primers_Table_1.tsv', na = '')
primer_sets_non_specific <- primer_sets %>% filter(is.na(specificity))
primer_sets_specific <- primer_sets %>% filter(!(is.na(specificity)))
knitr::kable(select(primer_sets, primer_set_id:primer_set_name, fwd_name, rev_name, length_yeast)) %>%
kableExtra::kable_styling()| primer_set_id | primer_set_name | fwd_name | rev_name | length_yeast |
|---|---|---|---|---|
| 40 | Zhan | Uni18SF | Uni18SR | 461 |
| 13 | Brate1 | 3NDf | V4_euk_R1 | 465 |
| 14 | Brate2 | 3NDf | V4_euk_R2 | 458 |
| 18 | Parfrey | 515F | 1119r | 598 |
| 33 | Needham | 515F Univ | 926R | 589 |
| 34 | Lambert | 515FY | NSR951 | 391 |
| 35 | UNonMet | EUK581-F | EUK1134-R | 578 |
| 4 | Hugerth_2 | 563f | 1132r | 587 |
| 7 | Bass | V4_1f | TAReukREV3 | 417 |
| 8 | Stoeck_V4_2 | TAReuk454FWD1 | TAReukREV3 | 417 |
| 16 | Piredda_V4 | TAReuk454FWD1 | V4 18S Next.Rev | 417 |
| 19 | Vannini | Claudia Vannini (F) | Claudia Vannini (R) | 439 |
| 36 | Stoeck_V4_1 | TAReuk454FWD1 | V4RB | 417 |
| 1 | Hadziavdic_1 | F-566 | R-1200 | 635 |
| 15 | Moreno | EUKAF | EUKAR | 410 |
| 17 | Comeau | E572F | E1009R | 438 |
| 25 | Mangot | NSF563 | NSR951 | 380 |
| 2 | Hadziavdic_2 | A-528F | R-952 | 379 |
| 39 | Egge | A-528F | PRYM01+7 | 396 |
| 3 | Hugerth_1 | 574*f | 1132r | 576 |
| 23 | Venter | 590F | 1300R | 720 |
| 21 | Zimmerman | D512for | D978rev | 437 |
| 41 | Harder | Cerc479F | Cerc750R | 283 |
| 24 | Simon | EK-565F-NGS | EUK1134-R | 527 |
| 5 | Hugerth_3 | 616f | 1132r | 534 |
| 6 | Hugerth_4 | 616*f | 1132r | 534 |
| 28 | Amaral_1 | 1380F | 1510R | 176 |
| 29 | Amaral_2 | 1389F | 1510R | 167 |
| 31 | Piredda_V9 | 1388F | 1510R | 167 |
| 27 | Stoeck_V9 | 1391F | EukB | 169 |
6 Computing the matches
This part is done by an R script PR2 Primers pr2_match.R that is executed in batch mode.
6.1 Programing Notes
- Use Biostrings
Accessor methods : In the code snippets below, x is an MIndex object.
- length(x): The number of patterns that matches are stored for.
- names(x): The names of the patterns that matches are stored for.
- startIndex(x): A list containing the starting positions of the matches for each pattern.
- endIndex(x): A list containing the ending positions of the matches for each pattern.
- elementNROWS(x): An integer vector containing the number of matches for each pattern.
- x[[i]]: Extract the matches for the i-th pattern as an IRanges object.
- unlist(x, recursive=TRUE, use.names=TRUE): Return all the matches in a single IRanges object. recursive and use.names are ignored.
- extractAllMatches(subject, mindex): Return all the matches in a single XStringViews object.
One could also use another function which does not give the position * match_fwd <- vcountPattern(fwd, seq,max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=FALSE, algorithm=“auto”)
7 Load the data file
This avoids recomputing each time.
All sequences for which introns have been removed are filtered out (contain "_UC" in pr2_accession)
load(file = str_c("output/pr2_match_", gene_selected, ".rda"))
print(str_c("Before filtration: ", nrow(pr2_match_final)))[1] "Before filtration: 5112870"
pr2_match_final_filtered <- pr2_match_final %>% # Remove sequences for which the introns have been removed
filter(!str_detect(pr2_accession, "_UC")) %>% # Remove sequence that are shorter
filter((gene_region == "V4" & sequence_length >= sequence_length_min_V4) | (gene_region ==
"V9" & sequence_length >= sequence_length_min_V9) | (!(gene_region %in% c("V4", "V9") &
sequence_length >= sequence_length_min))) %>% # remove '_' from primer_labels
mutate(primer_label = str_replace_all(primer_label, "_", " ")) %>% # Only keep the selected primers
filter(primer_set_id %in% primer_sets$primer_set_id)
print(str_c("After filtration: ", nrow(pr2_match_final_filtered)))[1] "After filtration: 2674721"
8 Summarize the data in tables
8.1 Summarize all eukaryotes
pr2_match_summary_primer_set <- pr2_match_final_filtered %>% group_by(gene_region, primer_label,
primer_set_id) %>% summarize(pct_fwd = sum(!is.na(fwd_pos))/n() * 100, pct_rev = sum(!is.na(rev_pos))/n() *
100, pct_amplicons = sum(!is.na(ampli_size))/n() * 100, ampli_size_mean = mean(ampli_size,
na.rm = TRUE), ampli_size_sd = sd(ampli_size, na.rm = TRUE), ampli_size_max = max(ampli_size,
na.rm = TRUE), ampli_size_min = min(ampli_size, na.rm = TRUE), n_seq = n())
# write_tsv(pr2_match_summary_primer_set,
# str_c('output/pr2_match_summary_primer_set_',gene_selected,'.tsv'), na='')
# Long form for Number of sequences
# This dataframe is used to re-order the bars correctly
pct_category_order <- data.frame(pct_category = c("pct_amplicons", "pct_fwd", "pct_rev"), pct_category_order = c(1,
3, 2))
pr2_match_summary_primer_set_long <- pr2_match_summary_primer_set %>% tidyr::gather("pct_category",
"pct_seq", pct_fwd:pct_amplicons) %>% left_join(pct_category_order)8.2 Summarize per supergroup
pr2_match_summary_primer_set_sg <- pr2_match_final_filtered %>% group_by(supergroup, gene_region,
primer_label, primer_set_id) %>% summarize(pct_fwd = sum(!is.na(fwd_pos))/n() * 100, pct_rev = sum(!is.na(rev_pos))/n() *
100, pct_amplicons = sum(!is.na(ampli_size))/n() * 100, ampli_size_mean = case_when(pct_amplicons >
0 ~ mean(ampli_size, na.rm = TRUE)), ampli_size_sd = case_when(pct_amplicons > 0 ~ sd(ampli_size,
na.rm = TRUE)), ampli_size_max = case_when(pct_amplicons > 0 ~ max(ampli_size, na.rm = TRUE)),
ampli_size_min = case_when(pct_amplicons > 0 ~ min(ampli_size, na.rm = TRUE)), n_seq = n()) %>%
ungroup()
# write_tsv( pr2_match_summary_primer_set_sg,
# str_c('output/pr2_match_summary_primer_set_',gene_selected,'_per_sg.tsv'), na='')8.3 Summarize per class
pr2_match_summary_primer_set_class <- pr2_match_final_filtered %>% group_by(supergroup, division,
class, gene_region, primer_label, primer_set_id) %>% summarize(pct_fwd = sum(!is.na(fwd_pos))/n() *
100, pct_rev = sum(!is.na(rev_pos))/n() * 100, pct_amplicons = sum(!is.na(ampli_size))/n() *
100, ampli_size_mean = case_when(pct_amplicons > 0 ~ mean(ampli_size, na.rm = TRUE)), ampli_size_sd = case_when(pct_amplicons >
0 ~ sd(ampli_size, na.rm = TRUE)), ampli_size_max = case_when(pct_amplicons > 0 ~ max(ampli_size,
na.rm = TRUE)), ampli_size_min = case_when(pct_amplicons > 0 ~ min(ampli_size, na.rm = TRUE)),
n_seq = n())
# write_tsv(pr2_match_summary_primer_set_class,
# str_c('output/pr2_match_summary_primer_set_',gene_selected,'_per_class.tsv'), na='')9 Amplicon length
9.1 Average size
ggplot(pr2_match_final, aes(x = primer_label, y = ampli_size, group = primer_set_id)) + geom_boxplot(outlier.alpha = 0.3) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + xlab("Primer set")9.2 Plot an example of amplicon distribution (Fig. 3)
fig3A <- list()
fig3B <- list()
for (one_primer_set in c(4, 29)) {
if (one_primer_set == 4) {
xmin = 570
xmax = 610
xmax2 = 2000
} else {
xmin = 130
xmax = 190
xmax2 = 1000
}
pr2_match_final_one <- pr2_match_final %>% filter(primer_set_id == one_primer_set) %>%
filter(!(supergroup %in% c("Apusozoa", "Eukaryota_X", "Protoalveoalata")))
primer_label <- str_replace_all(pr2_match_final_one$primer_label[1], "_", " ")
g <- ggplot(pr2_match_final_one, aes(x = ampli_size)) + geom_density(fill = "blue", alpha = 0.9) +
xlab("Amplicon size") + ggtitle(str_c("Primer set - ", primer_label)) + xlim(xmin,
xmax)
print(g)
fig3A[[one_primer_set]] <- ggplot(pr2_match_final_one, aes(x = ampli_size, fill = supergroup)) +
geom_density(alpha = 0.9) + theme_bw() + theme(legend.position = "none") + scale_fill_viridis_d(option = "inferno") +
xlab("Amplicon size (bp)") + ylab("Density") + ggtitle(str_c("Primer set - ", primer_label)) +
xlim(xmin, xmax)
print(fig3A[[one_primer_set]])
fig3B[[one_primer_set]] <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) +
geom_boxplot(outlier.alpha = 0.3) + theme_bw() + coord_flip() + # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("Supergroup") + ylab("Amplicon size (bp)") + ylim(0, xmax2)
print(fig3B[[one_primer_set]])
g <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + geom_violin() +
coord_flip() + # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("Supergroup") + ylab("Amplicon size (bp)")
print(g)
}9.3 Plot an example of amplicon size distribution
fig3C <- list()
for (one_primer_set in c(4, 29)) {
pr2_match_summary_filtered <- pr2_match_summary_primer_set_sg %>% filter((n_seq > 20) &
(primer_set_id == one_primer_set) & !(supergroup %in% c("Apusozoa", "Eukaryota_X")))
fig3C[[one_primer_set]] <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered,
aes(x = str_c(supergroup, " - n = ", n_seq), y = pct_amplicons, fill = supergroup),
position = "dodge") + theme_bw() + theme(legend.position = "none") + scale_fill_viridis_d(option = "inferno") +
coord_flip() + ylab("% of sequences amplified") + xlab("") # +
# ggtitle (str_c('Set - ', one_primer_set, ' - % amplified per Supergroup') )
print(fig3C[[one_primer_set]])
g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(supergroup,
" - n = ", n_seq), y = ampli_size_mean), colour = "black") + coord_flip() + geom_errorbar(aes(x = str_c(supergroup,
" - n = ", n_seq), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean -
ampli_size_sd)) + xlab("Supergroup") + ylab("Amplicon size (bp)") # +
# scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) + ggtitle (str_c('Set - ',
# one_primer_set, ' - Amplicon sizes') ) + geom_hline(yintercept = c(450,550) , linetype=
# 2)
print(g)
}10 Graphics
10.1 All Eukaryotes
Comments
- Percent of sequences amplified
- Logically, the % of seq amplified is always < min(% of sequences matching forwar, % of sequences matching reverse)
- In general it is the reverse primer that causes problems.
- Some primer sets do not amplify any sequence (11, 19, 20, 21)
- For V9, primer set 30 reverse primer is in the ITS region which is not present in the PR2 sequences, so no amplification.
- Amplicon sizes
- Only 8 V4 primer sets are suitable for Illumina 2x250 (max amplicon size = 450 bp)
- This is if we consider the mean… If we consider the variation around the mean then, only 3 suitable for 2x250
- Five more V4 primer sets are suitable for Illumina 3x250 (max amplicon size = 550 bp)
fig1 <- list()
for (one_region in gene_regions) {
pr2_match_summary_primer_set_region_long <- pr2_match_summary_primer_set_long %>% filter(gene_region ==
one_region) %>% # Remove the group specific primers
filter(!(primer_set_id %in% c(35, 19, 39, 21, 41)))
pr2_match_summary_primer_set_region <- pr2_match_summary_primer_set %>% filter(gene_region ==
one_region) %>% # Remove the group specific primers)
filter(!(primer_set_id %in% c(35, 19, 39, 21, 41)))
pr2_match_region <- pr2_match_final %>% filter(gene_region == one_region) %>% # Remove the group specific primers
filter(!(primer_set_id %in% c(35, 19, 39, 21, 41)))
g <- ggplot(pr2_match_summary_primer_set_region_long) + geom_col(aes(x = reorder(primer_label,
-primer_set_id), y = pct_seq, fill = fct_reorder(pct_category, pct_category_order)),
width = 0.7, position = "dodge") + theme_bw() + xlab("Primer set") + ylab("% of sequences amplified") +
scale_fill_manual(name = "", values = c(pct_amplicons = "black", pct_fwd = "grey80",
pct_rev = "grey40"), labels = c("Amplicons", "Primer rev", "Primer fwd")) + ggtitle(str_c(one_region,
" - Percentage of sequences recovered")) + theme(axis.text.x = element_text(angle = 0,
hjust = 1)) + ylim(0, 100) + coord_flip() + guides(fill = guide_legend(nrow = 1)) +
theme(legend.position = "top", legend.box = "horizontal")
print(g)
fig1[[str_c(one_region, "pct", sep = " ")]] <- g
g <- ggplot(filter(pr2_match_summary_primer_set_region, !is.nan(ampli_size_mean))) + geom_point(aes(x = reorder(primer_label,
-primer_set_id), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = reorder(primer_label,
primer_set_id), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) +
theme_bw() + xlab("Primer set") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c(one_region, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
geom_hline(yintercept = c(450, 550), linetype = c(2, 3)) + ylim(0, 850) + theme(axis.text.x = element_text(angle = 0,
hjust = 1)) + coord_flip()
print(g)
fig1[[str_c(one_region, "size", sep = " ")]] <- g
# g <- ggplot(pr2_match_region) + geom_boxplot(aes(x=reorder(primer_label, primer_set_id),
# y=ampli_size), colour='black', outlier.alpha = 0.3) + theme_bw() + xlab('Primer set') +
# ylab('Amplicon size (bp)') + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500))
# + ggtitle (str_c(one_region, ' - Amplicon size - Lines correspond to limits for Illumina
# 2x250 and 2x300 respectively') ) + geom_hline(yintercept = c(450,550), linetype = c(2,3)
# )+ theme(axis.text.x = element_text(angle = 45, hjust = 1)) print(g)
}10.2 By supergroup
Comments
- Excavata have a very different patterns from the rest of the group. They are not amplified by quite a few primer sets. They have also bigger amplicons
- Some groups exhibit higher variability in amplicon size (e.g Chlorophyta)
for (specific_set in c(FALSE, TRUE)) {
pr2_match_summary_primer_set_sg_region <- pr2_match_summary_primer_set_sg %>% filter((n_seq >
20)) %>% filter(!(supergroup %in% c("Apusozoa", "Eukaryota_X")))
nrow = 5
if (specific_set) {
pr2_match_summary_primer_set_sg_region <- pr2_match_summary_primer_set_sg %>% filter((n_seq >
20)) %>% filter((primer_set_id %in% c(6, 13, 34, 25, 33, 16))) %>% filter(!(supergroup %in%
c("Apusozoa", "Eukaryota_X")))
nrow = 1
}
g <- ggplot(pr2_match_summary_primer_set_sg_region) + geom_col(aes(x = supergroup, y = pct_amplicons,
fill = supergroup), position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") +
xlab("Supergroup") + # ggtitle ('% amplified per Supergroup' ) +
scale_fill_viridis_d(option = "inferno") + ylim(0, 100) + facet_wrap(~primer_label, scales = "fixed",
nrow = nrow) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top",
legend.box = "horizontal") + theme(legend.position = "none")
print(g)
list_label <- str_c("pct", case_when(specific_set ~ "specific", TRUE ~ ""), sep = " ")
fig_supergroup[[list_label]] <- g
g <- ggplot(filter(pr2_match_summary_primer_set_sg_region, !is.nan(ampli_size_mean))) +
geom_point(aes(x = supergroup, y = ampli_size_mean), colour = "black") + theme_bw() +
coord_flip() + geom_errorbar(aes(x = supergroup, ymax = ampli_size_mean + ampli_size_sd,
ymin = ampli_size_mean - ampli_size_sd)) + xlab("Supergroup") + ylab("Amplicon size (bp)") +
# ggtitle (str_c(' - Amplicon size - Lines correspond to limits for Illumina 2x250 and
# 2x300 respectively') ) +
geom_hline(yintercept = c(450, 550), linetype = 2) + facet_wrap(~primer_label, scales = "fixed",
nrow = nrow)
print(g)
list_label <- str_c("size", case_when(specific_set ~ "specific", TRUE ~ ""), sep = " ")
fig_supergroup[[list_label]] <- g
}10.3 By class for autotrophs
fig_class <- list()
for (specific_set in c(FALSE, TRUE)) {
pr2_match_summary_filtered <- filter(pr2_match_summary_primer_set_class, (n_seq > 20) &
(division %in% c("Haptophyta", "Dinoflagellata", "Chlorophyta", "Ochrophyta", "Cryptophyta")))
nrow = 5
if (specific_set) {
pr2_match_summary_filtered <- filter(pr2_match_summary_primer_set_class, (n_seq > 20) &
(division %in% c("Haptophyta", "Dinoflagellata", "Chlorophyta", "Ochrophyta", "Cryptophyta"))) %>%
filter((primer_set_id %in% c(16, 21)))
nrow = 1
}
if (nrow(pr2_match_summary_filtered) > 0) {
g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered,
aes(x = str_c(str_trunc(division, 20, ellipsis = ""), "-", class), y = pct_amplicons,
fill = class), position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") +
xlab("Class") + scale_fill_viridis_d(option = "inferno") + ylim(0, 100) + facet_wrap(~primer_label,
scales = "fixed", nrow = nrow) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top",
legend.box = "horizontal") + theme(legend.position = "none")
print(g)
list_label <- str_c("pct", case_when(specific_set ~ "specific", TRUE ~ ""), sep = " ")
fig_class[[list_label]] <- g
g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(str_trunc(division,
3, ellipsis = ""), "-", class), y = ampli_size_mean), colour = "black") + theme_bw() +
coord_flip() + geom_errorbar(aes(x = str_c(str_trunc(division, 3, ellipsis = ""),
"-", class), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) +
xlab("Class") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c("Set -", one_primer_set, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
geom_hline(yintercept = c(450, 550), linetype = 2) + facet_wrap(~primer_label,
scales = "fixed", nrow = nrow)
print(g)
}
}11 Specific analyses
11.1 Specific et sets for Opisthokonta
- 35 UnNonMet
- 16 Piredda
- 17 Comeau
for (one_primer_set in c(16, 17, 35)) {
pr2_match_summary_filtered <- filter(pr2_match_summary_primer_set_class, (n_seq > 20) &
(supergroup %in% c("Opisthokonta")) & (primer_set_id == one_primer_set))
g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered, aes(x = str_c(division,
"-", class, " - n= ", n_seq), y = pct_amplicons), fill = "grey", position = "dodge") +
theme_bw() + coord_flip() + ylab("% of sequences amplified") + xlab("Class") + ggtitle(str_c("Set -",
one_primer_set, " - % amplified per Class"))
print(g)
g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(division,
"-", class, " - n= ", n_seq), y = ampli_size_mean), colour = "black") + coord_flip() +
geom_errorbar(aes(x = str_c(division, "-", class, " - n= ", n_seq), ymax = ampli_size_mean +
ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + xlab("Class") + ylab("Amplicon size (bp)") +
# scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c("Set -", one_primer_set, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
geom_hline(yintercept = c(450, 550), linetype = 2)
print(g)
}12 Tables
12.1 Table 1 and Supplementary tables
if (gene_selected == "18S_rRNA") file_name_xls = "output/primers_18S.xlsx"
excel_style <- openxlsx::createStyle(numFmt = "0.00")
openxlsx::write.xlsx(list(select(primer_sets_non_specific, -primer_set_name, -specificity),
select(primer_sets_specific, -primer_set_name), pr2_match_summary_primer_set, pr2_match_summary_primer_set_sg,
pr2_match_summary_primer_set_class), file = file_name_xls, sheetName = c("Table 1", "Specific primer sets",
"Summary Eukaryotes", "Summary Supergroup", "Summary Class"), zoom = 80, firstActiveRow = 2,
firstActiveRow = 6, colWidths = "auto")13 Figures
13.1 Fig. 1 - V4 and V9 - Amplification % and size
legend <- cowplot::get_legend( fig1[["V4 pct"]] +
# create some space to the left of the legend
theme(legend.box.margin = margin(0, 0, 0, 20))
)
fig_1 <- cowplot::plot_grid(legend, NULL,
fig1[["V4 pct"]] +ggtitle("")+ theme(legend.position="none") + ylab(""),
fig1[["V4 size"]] +ggtitle("") + xlab("") + ylab(""),
fig1[["V9 pct"]] +ggtitle("")+ theme(legend.position="none"),
fig1[["V9 size"]] +ggtitle("") + xlab(""),
labels = c("","","A","", "B",""), label_size = 22, label_x = 0.05,
ncol = 2, nrow = 3, align = "hv", rel_heights = c(1,20,6) )
fig_1
ggsave(plot= fig_1 , filename="figs/fig_1_pct_sizes_V4_V9.pdf",
width = 12 , height = 12, scale=1.75, units="cm", useDingbats=FALSE)13.2 Fig. 2 - Example of V4 and V9
# g.fig3A <- ggplotGrob(fig3A) # convert to gtable g.fig3B <- ggplotGrob(fig3B) # convert
# to gtable fig3A.widths <- g.fig3A$widths[1:3] # extract the first three widths, #
# corresponding to left margin, y lab, and y axis fig3B.widths <- g.fig3B$widths[1:3] #
# same for mpg plot max.widths <- grid::unit.pmax(fig3A.widths, fig3B.widths) # calculate
# maximum widths g.fig3A$widths[1:3] <- max.widths # assign max. widths to iris gtable
# g.fig3B$widths[1:3] <- max.widths # assign max widths to mpg gtable
fig_2 <- cowplot::plot_grid(fig3A[[4]], fig3B[[4]], fig3C[[4]], fig3A[[29]], fig3B[[29]], fig3C[[29]],
labels = c("A", "", "", "B", "", ""), ncol = 3, nrow = 2, align = "h")
fig_2
ggsave(plot = fig_2, filename = "figs/fig_2_examples_V4_V9.pdf", width = 13, height = 7, scale = 2.3,
units = "cm", useDingbats = FALSE)13.3 Fig. 3 - Supergroup analysis
fig_3 <- cowplot::plot_grid(fig_supergroup[["pct specific"]], NULL, fig_supergroup[["size specific"]],
NULL, labels = c("A", "", "B", ""), ncol = 2, nrow = 2, align = "v", rel_widths = c(13,
0.2))
fig_3
ggsave(plot = fig_3, filename = "figs/fig_3_supergroup.pdf", width = 14, height = 7, scale = 1.75,
units = "cm", useDingbats = FALSE)13.4 Fig. 4 - Class analysis
13.5 Fig. Suppl -Supergroup ss analysis
fig_sup_supergroup <- cowplot::plot_grid(fig_supergroup[["pct "]], NULL, fig_supergroup[["size "]],
NULL, labels = c("A", "", "B", ""), ncol = 2, nrow = 2, align = "v", rel_widths = c(13,
0.2))
fig_sup_supergroup
ggsave(plot = fig_sup_supergroup, filename = "figs/fig_sup_supergroup.pdf", width = 14, height = 20,
scale = 2.5, units = "cm", useDingbats = FALSE)